# code to load packages
library(tidyverse)
library(tidymodels)
library(knitr)
library(dplyr)
library(ggplot2)
library(patchwork)
library(corrplot)
#baseR
auto <- read_csv("data/autodatawg.csv")
New names:
• `notes` -> `notes...23`
• `notes` -> `notes...61`
• `` -> `...99`
Rows: 583 Columns: 123
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (109): Collector, Higher taxon, Taxon, aagroup, miss mC, miss mT, miss ...
dbl (2): radius L, Ulna L
lgl (12): m2-5 av, hpp2-5 av, hmp2-5 av, hdp2-5 av, notes...61, fpp2-5 av, ...
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
#Set hms as numeric
auto$hm1 <- as.numeric(as.character(auto$hm1))
Warning: NAs introduced by coercion
auto$hm2 <- as.numeric(as.character(auto$hm2))
Warning: NAs introduced by coercion
auto$hm3 <- as.numeric(as.character(auto$hm3))
Warning: NAs introduced by coercion
auto$hm4 <- as.numeric(as.character(auto$hm4))
Warning: NAs introduced by coercion
auto$hm5 <- as.numeric(as.character(auto$hm5))
Warning: NAs introduced by coercion
#Set hpps as numeric
auto$hpp1 <- as.numeric(as.character(auto$hpp1))
Warning: NAs introduced by coercion
auto$hpp2 <- as.numeric(as.character(auto$hpp2))
Warning: NAs introduced by coercion
auto$hpp3 <- as.numeric(as.character(auto$hpp3))
Warning: NAs introduced by coercion
auto$hpp4 <- as.numeric(as.character(auto$hpp4))
Warning: NAs introduced by coercion
auto$hpp5 <- as.numeric(as.character(auto$hpp5))
Warning: NAs introduced by coercion
#remove NAs
autoclean <- auto %>%
filter(complete.cases(hm1, hm2, hm3, hm4, hm5, hpp1, hpp2, hpp3, hpp4, hpp5))
#avg hand metatarsals for each higher taxa
hm_avg <- autoclean |>
group_by(aagroup) |>
summarize(average_hm1 = mean(hm1, na.rm = TRUE),
average_hm2 = mean(hm2, na.rm = TRUE),
average_hm3 = mean(hm3, na.rm = TRUE),
average_hm4 = mean(hm4, na.rm = TRUE),
average_hm5 = mean(hm5, na.rm = TRUE)
)
print(hm_avg)
#avg proximal phalanges for each higher taxa
hpp_avg <- autoclean |>
group_by(aagroup) |>
summarize(average_hpp1 = mean(hpp1, na.rm = TRUE),
average_hpp2 = mean(hpp2, na.rm = TRUE),
average_hpp3 = mean(hpp3, na.rm = TRUE),
average_hpp4 = mean(hpp4, na.rm = TRUE),
average_hpp5 = mean(hpp5, na.rm = TRUE)
)
print(hpp_avg)
#get total digit lengths
autoclean$total_digit1 <- autoclean$hm1 + autoclean$hpp1
autoclean$total_digit2 <- autoclean$hm2 + autoclean$hpp2
autoclean$total_digit3 <- autoclean$hm3 + autoclean$hpp3
autoclean$total_digit4 <- autoclean$hm4 + autoclean$hpp4
autoclean$total_digit5 <- autoclean$hm5 + autoclean$hpp5
head(autoclean)
NA
#AVG DIGIT LENGTHS
avg_digit_lengths <- hm_avg |>
full_join(hpp_avg, by = "aagroup") |>
mutate(
avg_digit1 = hm_avg$average_hm1 + hpp_avg$average_hpp1,
avg_digit2 = hm_avg$average_hm2 + hpp_avg$average_hpp2,
avg_digit3 = hm_avg$average_hm3 + hpp_avg$average_hpp3,
avg_digit4 = hm_avg$average_hm4 + hpp_avg$average_hpp4,
avg_digit5 = hm_avg$average_hm5 + hpp_avg$average_hpp5
) |>
select(aagroup, avg_digit1, avg_digit2, avg_digit3, avg_digit4, avg_digit5)
avg_digit_lengths
#EXPLORING COVARIATION
# BIPLOTS FOR ALL DIGITS IN DATASET
#to calc rsquared
r_squared <- function(x, y) {
model <- lm(y ~ x)
summary(model)$r.squared
}
pairs <- list(
c("total_digit1", "total_digit2"),
c("total_digit1", "total_digit3"),
c("total_digit1", "total_digit4"),
c("total_digit1", "total_digit5"),
c("total_digit2", "total_digit3"),
c("total_digit2", "total_digit4"),
c("total_digit2", "total_digit5"),
c("total_digit3", "total_digit4"),
c("total_digit3", "total_digit5"),
c("total_digit4", "total_digit5")
)
for (pair in pairs) {
x <- autoclean[[pair[1]]]
y <- autoclean[[pair[2]]]
r_squared <- calculate_r_squared(x, y)
p <- ggplot(autoclean, aes_string(x = pair[1], y = pair[2])) +
geom_point() +
labs(x = pair[1], y = pair[2], title = paste("Biplot of", pair[1], "and", pair[2]), subtitle = paste("R^2 =", r_squared)) +
theme_minimal()
print(p)
}
Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
ℹ Please use tidy evaluation idioms with `aes()`.
ℹ See also `vignette("ggplot2-in-packages")` for more information.
This warning is displayed once every 8 hours.
Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
generated.
#CORRELATION MATRIX
correlation_matrix <- cor(autoclean[, c("total_digit1", "total_digit2", "total_digit3", "total_digit4", "total_digit5")])
print(correlation_matrix)
total_digit1 total_digit2 total_digit3 total_digit4 total_digit5
total_digit1 1.0000000 0.9566183 0.9565192 0.9634344 0.9664044
total_digit2 0.9566183 1.0000000 0.9791062 0.9725032 0.9686648
total_digit3 0.9565192 0.9791062 1.0000000 0.9941848 0.9786560
total_digit4 0.9634344 0.9725032 0.9941848 1.0000000 0.9928006
total_digit5 0.9664044 0.9686648 0.9786560 0.9928006 1.0000000
corrplot(correlation_matrix, method = "color", addCoef.col = "black", tl.col = "black", tl.srt = 45)
# Function to calculate R^2
calc_r_squared <- function(x, y) {
model <- lm(y ~ x)
summary(model)$r.squared
}
# Create pairs of digit lengths
pairs <- list(
c("total_digit1", "total_digit2"),
c("total_digit1", "total_digit3"),
c("total_digit1", "total_digit4"),
c("total_digit1", "total_digit5"),
c("total_digit2", "total_digit3"),
c("total_digit2", "total_digit4"),
c("total_digit2", "total_digit5"),
c("total_digit3", "total_digit4"),
c("total_digit3", "total_digit5"),
c("total_digit4", "total_digit5")
)
# Get unique taxa
unique_taxa <- unique(autoclean$aagroup)
# for each higher taxa
for (taxon in unique_taxa) {
# filter for the current higher taxon
taxa_data <- autoclean[autoclean$aagroup == taxon, ]
# going thru each digit pair
for (pair in pairs) {
x <- taxa_data[[pair[1]]]
y <- taxa_data[[pair[2]]]
# get r^2
r_squared <- calc_r_squared(x, y)
# actual plot
p <- ggplot(taxa_data, aes_string(x = pair[1], y = pair[2])) +
geom_point(color = "#0096FF", alpha = 0.6, size = 3) +
labs(
x = pair[1],
y = pair[2],
title = paste("Biplot of", pair[1], "and", pair[2], "for", taxon),
subtitle = paste("R^2 =", round(r_squared, 3))
) +
theme_minimal() +
facet_wrap(~ `Sex/age`) +
theme(
plot.title = element_text(size = 12, face = "bold", color = "#0096FF"),
plot.subtitle = element_text(size = 10, color = "black"),
axis.title.x = element_text(size = 10, face = "bold"),
axis.title.y = element_text(size = 10, face = "bold"),
panel.background = element_rect(fill = "#FFF1F3"),
panel.grid.major = element_line(color = "white"),
panel.grid.minor = element_line(color = "white")
)
print(p)
}
}
# Function to calculate R^2
calc_r_squared <- function(x, y) {
model <- lm(y ~ x)
summary(model)$r.squared
}
# Create pairs of digit lengths
pairs <- list(
c("total_digit1", "total_digit2"),
c("total_digit1", "total_digit3"),
c("total_digit1", "total_digit4"),
c("total_digit1", "total_digit5"),
c("total_digit2", "total_digit3"),
c("total_digit2", "total_digit4"),
c("total_digit2", "total_digit5"),
c("total_digit3", "total_digit4"),
c("total_digit3", "total_digit5"),
c("total_digit4", "total_digit5")
)
# Get unique taxa
unique_taxa <- unique(autoclean$aagroup)
# for each higher taxa
for (taxon in unique_taxa) {
# filter for the current higher taxon
taxa_data <- autoclean[autoclean$aagroup == taxon, ]
# going thru each digit pair
for (pair in pairs) {
x <- taxa_data[[pair[1]]]
y <- taxa_data[[pair[2]]]
# get r^2
r_squared <- calc_r_squared(x, y)
# actual plot
p <- ggplot(taxa_data, aes_string(x = pair[1], y = pair[2])) +
geom_point(color = "#0096FF", alpha = 0.6, size = 3) +
labs(
x = pair[1],
y = pair[2],
title = paste("Biplot of", pair[1], "and", pair[2], "for", taxon),
subtitle = paste("R^2 =", round(r_squared, 3))
) +
theme_minimal() +
theme(
plot.title = element_text(size = 12, face = "bold", color = "#0096FF"),
plot.subtitle = element_text(size = 10, color = "black"),
axis.title.x = element_text(size = 10, face = "bold"),
axis.title.y = element_text(size = 10, face = "bold"),
panel.background = element_rect(fill = "#FFF1F3"),
panel.grid.major = element_line(color = "white"),
panel.grid.minor = element_line(color = "white")
)
print(p)
}
}
NA
# Get unique taxa
unique_taxa <- unique(autoclean$aagroup)
# for each higher taxa
for (taxon in unique_taxa) {
# filter for the current higher taxon
taxa_data <- autoclean[autoclean$aagroup == taxon, ]
#empty matrix
r_squared_matrix <- matrix(NA, nrow = 5, ncol = 5)
rownames(r_squared_matrix) <- colnames(r_squared_matrix) <- c("total_digit1", "total_digit2", "total_digit3", "total_digit4", "total_digit5")
for (pair in pairs) {
x <- taxa_data[[pair[1]]]
y <- taxa_data[[pair[2]]]
# get r^2
r_squared <- calc_r_squared(x, y)
# Store R^2 in the matrix
r_squared_matrix[pair[1], pair[2]] <- r_squared
r_squared_matrix[pair[2], pair[1]] <- r_squared
}
print(paste("R^2 matrix for taxon:", taxon))
print(r_squared_matrix)
# Plot R^2 matrix as a heatmap
corrplot(r_squared_matrix, method = "color", addCoef.col = "black", tl.col =
"black", tl.srt = 45, title = paste("R^2 Matrix for", taxon), mar=c(0,0,1,0))
}
[1] "R^2 matrix for taxon: PAN"
total_digit1 total_digit2 total_digit3 total_digit4 total_digit5
total_digit1 NA 0 0 0 0
total_digit2 0 NA 0 0 0
total_digit3 0 0 NA 0 0
total_digit4 0 0 0 NA 0
total_digit5 0 0 0 0 NA
[1] "R^2 matrix for taxon: OWM"
total_digit1 total_digit2 total_digit3 total_digit4 total_digit5
total_digit1 NA 0.5894414 0.5515566 0.5649344 0.5957257
total_digit2 0.5894414 NA 0.9612788 0.9602187 0.9448672
total_digit3 0.5515566 0.9612788 NA 0.9960620 0.9811747
total_digit4 0.5649344 0.9602187 0.9960620 NA 0.9880566
total_digit5 0.5957257 0.9448672 0.9811747 0.9880566 NA
[1] "R^2 matrix for taxon: COLUGO"
total_digit1 total_digit2 total_digit3 total_digit4 total_digit5
total_digit1 NA 0 0 0 0
total_digit2 0 NA 0 0 0
total_digit3 0 0 NA 0 0
total_digit4 0 0 0 NA 0
total_digit5 0 0 0 0 NA
[1] "R^2 matrix for taxon: NWM"
total_digit1 total_digit2 total_digit3 total_digit4 total_digit5
total_digit1 NA 0.9646182 0.9528415 0.9295581 0.9203128
total_digit2 0.9646182 NA 0.9846387 0.9588601 0.9485791
total_digit3 0.9528415 0.9846387 NA 0.9862927 0.9679882
total_digit4 0.9295581 0.9588601 0.9862927 NA 0.9884083
total_digit5 0.9203128 0.9485791 0.9679882 0.9884083 NA
[1] "R^2 matrix for taxon: SCANDENTIA"
total_digit1 total_digit2 total_digit3 total_digit4 total_digit5
total_digit1 NA 0.9243360 0.8907543 0.9102392 0.9646054
total_digit2 0.9243360 NA 0.9817450 0.9766877 0.9619513
total_digit3 0.8907543 0.9817450 NA 0.9959032 0.9393150
total_digit4 0.9102392 0.9766877 0.9959032 NA 0.9491674
total_digit5 0.9646054 0.9619513 0.9393150 0.9491674 NA
[1] "R^2 matrix for taxon: LORISIFORM"
total_digit1 total_digit2 total_digit3 total_digit4 total_digit5
total_digit1 NA 0.5817319 0.7835986 0.9149144 0.8244058
total_digit2 0.5817319 NA 0.9060961 0.6307979 0.3938864
total_digit3 0.7835986 0.9060961 NA 0.8485835 0.6239906
total_digit4 0.9149144 0.6307979 0.8485835 NA 0.9098780
total_digit5 0.8244058 0.3938864 0.6239906 0.9098780 NA
[1] "R^2 matrix for taxon: LEMURID"
total_digit1 total_digit2 total_digit3 total_digit4 total_digit5
total_digit1 NA 0.9513217 0.9244045 0.9532332 0.9753546
total_digit2 0.9513217 NA 0.9320955 0.9455698 0.9565207
total_digit3 0.9244045 0.9320955 NA 0.9879778 0.9401933
total_digit4 0.9532332 0.9455698 0.9879778 NA 0.9774962
total_digit5 0.9753546 0.9565207 0.9401933 0.9774962 NA
NA
Overall: - 4&5, 3&4, 1&5 have highest correlation - 3&5 least correlated
unique_taxa <- unique(autoclean$aagroup)
for (taxon in unique_taxa) {
taxa_data <- autoclean[autoclean$aagroup == taxon, ]
if (nrow(taxa_data) > 2 && all(complete.cases(taxa_data[, c("total_digit1",
"total_digit2",
"total_digit3",
"total_digit4",
"total_digit5")]))) {
correlation_matrix <- cor(taxa_data[, c("total_digit1", "total_digit2",
"total_digit3", "total_digit4",
"total_digit5")])
print(taxon)
print(correlation_matrix)
corrplot(correlation_matrix, method = "color", addCoef.col = "black", tl.col =
"black", tl.srt = 45, title=taxon, mar=c(0,0,1,0) )
}
}
[1] "OWM"
total_digit1 total_digit2 total_digit3 total_digit4 total_digit5
total_digit1 1.0000000 0.7677508 0.7426686 0.7516212 0.7718327
total_digit2 0.7677508 1.0000000 0.9804483 0.9799075 0.9720428
total_digit3 0.7426686 0.9804483 1.0000000 0.9980291 0.9905426
total_digit4 0.7516212 0.9799075 0.9980291 1.0000000 0.9940104
total_digit5 0.7718327 0.9720428 0.9905426 0.9940104 1.0000000
[1] "NWM"
total_digit1 total_digit2 total_digit3 total_digit4 total_digit5
total_digit1 1.0000000 0.9821498 0.9761360 0.9641359 0.9593294
total_digit2 0.9821498 1.0000000 0.9922896 0.9792140 0.9739503
total_digit3 0.9761360 0.9922896 1.0000000 0.9931227 0.9838639
total_digit4 0.9641359 0.9792140 0.9931227 1.0000000 0.9941873
total_digit5 0.9593294 0.9739503 0.9838639 0.9941873 1.0000000
[1] "SCANDENTIA"
total_digit1 total_digit2 total_digit3 total_digit4 total_digit5
total_digit1 1.0000000 0.9614239 0.9437978 0.9540646 0.9821433
total_digit2 0.9614239 1.0000000 0.9908305 0.9882751 0.9807912
total_digit3 0.9437978 0.9908305 1.0000000 0.9979495 0.9691826
total_digit4 0.9540646 0.9882751 0.9979495 1.0000000 0.9742522
total_digit5 0.9821433 0.9807912 0.9691826 0.9742522 1.0000000
[1] "LORISIFORM"
total_digit1 total_digit2 total_digit3 total_digit4 total_digit5
total_digit1 1.0000000 0.7627135 0.8852110 0.9565116 0.9079680
total_digit2 0.7627135 1.0000000 0.9518908 0.7942279 0.6276037
total_digit3 0.8852110 0.9518908 1.0000000 0.9211859 0.7899308
total_digit4 0.9565116 0.7942279 0.9211859 1.0000000 0.9538753
total_digit5 0.9079680 0.6276037 0.7899308 0.9538753 1.0000000
[1] "LEMURID"
total_digit1 total_digit2 total_digit3 total_digit4 total_digit5
total_digit1 1.0000000 0.9753572 0.9614595 0.9763366 0.9876004
total_digit2 0.9753572 1.0000000 0.9654509 0.9724041 0.9780188
total_digit3 0.9614595 0.9654509 1.0000000 0.9939707 0.9696357
total_digit4 0.9763366 0.9724041 0.9939707 1.0000000 0.9886841
total_digit5 0.9876004 0.9780188 0.9696357 0.9886841 1.0000000